{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Handling Missing Values " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import pandas \n", "import pandas as pd " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# read ufo data \n", "ufo = pd.read_csv(\"http://bit.ly/uforeports\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
18236Grant ParkNaNTRIANGLEIL12/31/2000 23:00
18237Spirit LakeNaNDISKIA12/31/2000 23:00
18238Eagle RiverNaNNaNWI12/31/2000 23:45
18239Eagle RiverREDLIGHTWI12/31/2000 23:45
18240YborNaNOVALFL12/31/2000 23:59
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 Grant Park NaN TRIANGLE IL 12/31/2000 23:00\n", "18237 Spirit Lake NaN DISK IA 12/31/2000 23:00\n", "18238 Eagle River NaN NaN WI 12/31/2000 23:45\n", "18239 Eagle River RED LIGHT WI 12/31/2000 23:45\n", "18240 Ybor NaN OVAL FL 12/31/2000 23:59" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# last 5 rows \n", "ufo.tail() " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
0FalseTrueFalseFalseFalse
1FalseTrueFalseFalseFalse
2FalseTrueFalseFalseFalse
3FalseTrueFalseFalseFalse
4FalseTrueFalseFalseFalse
..................
18236FalseTrueFalseFalseFalse
18237FalseTrueFalseFalseFalse
18238FalseTrueTrueFalseFalse
18239FalseFalseFalseFalseFalse
18240FalseTrueFalseFalseFalse
\n", "

18241 rows × 5 columns

\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 False True False False False\n", "1 False True False False False\n", "2 False True False False False\n", "3 False True False False False\n", "4 False True False False False\n", "... ... ... ... ... ...\n", "18236 False True False False False\n", "18237 False True False False False\n", "18238 False True True False False\n", "18239 False False False False False\n", "18240 False True False False False\n", "\n", "[18241 rows x 5 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check missing values \n", "ufo.isnull() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "__Note__\n", "1. True: Missing \n", "2. False: Not Missing" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
0TrueFalseTrueTrueTrue
1TrueFalseTrueTrueTrue
2TrueFalseTrueTrueTrue
3TrueFalseTrueTrueTrue
4TrueFalseTrueTrueTrue
..................
18236TrueFalseTrueTrueTrue
18237TrueFalseTrueTrueTrue
18238TrueFalseFalseTrueTrue
18239TrueTrueTrueTrueTrue
18240TrueFalseTrueTrueTrue
\n", "

18241 rows × 5 columns

\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 True False True True True\n", "1 True False True True True\n", "2 True False True True True\n", "3 True False True True True\n", "4 True False True True True\n", "... ... ... ... ... ...\n", "18236 True False True True True\n", "18237 True False True True True\n", "18238 True False False True True\n", "18239 True True True True True\n", "18240 True False True True True\n", "\n", "[18241 rows x 5 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# using notnull() \n", "ufo.notnull() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "__Note__\n", "1. axis = 0: Rows \n", "2. axis = 1: Columns " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "City 25\n", "Colors Reported 15359\n", "Shape Reported 2644\n", "State 0\n", "Time 0\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sum of missing values: by default axis = 0\n", "ufo.isnull().sum() " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's create a series \n", "pd.Series([True, False, True]).sum() " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
21NaNNaNNaNLA8/15/1943 0:00
22NaNNaNLIGHTLA8/15/1943 0:00
204NaNNaNDISKCA7/15/1952 12:30
241NaNBLUEDISKMT7/4/1953 14:00
613NaNNaNDISKNV7/1/1960 12:00
1877NaNYELLOWCIRCLEAZ8/15/1969 1:00
2013NaNNaNNaNNH8/1/1970 9:30
2546NaNNaNFIREBALLOH10/25/1973 23:30
3123NaNREDTRIANGLEWV11/25/1975 23:00
4736NaNNaNSPHERECA6/23/1982 23:00
5269NaNNaNNaNAZ6/30/1985 21:30
6735NaNNaNFORMATIONTX4/1/1992 2:00
7208NaNNaNCIRCLEMI10/4/1993 17:30
8828NaNNaNTRIANGLEWA10/30/1995 21:30
8967NaNNaNVARIOUSCA12/8/1995 18:00
9273NaNNaNTRIANGLEOH5/1/1996 3:00
9388NaNNaNOVALCA6/12/1996 12:00
9587NaNNaNEGGFL8/24/1996 15:00
10399NaNNaNTRIANGLEIL6/15/1997 23:00
11625NaNNaNCIRCLETX6/7/1998 7:00
12441NaNREDFIREBALLWA10/26/1998 17:58
15767NaNNaNRECTANGLENV1/21/2000 11:30
15812NaNNaNLIGHTNV2/2/2000 3:00
16054NaNGREENNaNFL3/11/2000 3:30
16608NaNNaNSPHERENY6/15/2000 15:00
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "21 NaN NaN NaN LA 8/15/1943 0:00\n", "22 NaN NaN LIGHT LA 8/15/1943 0:00\n", "204 NaN NaN DISK CA 7/15/1952 12:30\n", "241 NaN BLUE DISK MT 7/4/1953 14:00\n", "613 NaN NaN DISK NV 7/1/1960 12:00\n", "1877 NaN YELLOW CIRCLE AZ 8/15/1969 1:00\n", "2013 NaN NaN NaN NH 8/1/1970 9:30\n", "2546 NaN NaN FIREBALL OH 10/25/1973 23:30\n", "3123 NaN RED TRIANGLE WV 11/25/1975 23:00\n", "4736 NaN NaN SPHERE CA 6/23/1982 23:00\n", "5269 NaN NaN NaN AZ 6/30/1985 21:30\n", "6735 NaN NaN FORMATION TX 4/1/1992 2:00\n", "7208 NaN NaN CIRCLE MI 10/4/1993 17:30\n", "8828 NaN NaN TRIANGLE WA 10/30/1995 21:30\n", "8967 NaN NaN VARIOUS CA 12/8/1995 18:00\n", "9273 NaN NaN TRIANGLE OH 5/1/1996 3:00\n", "9388 NaN NaN OVAL CA 6/12/1996 12:00\n", "9587 NaN NaN EGG FL 8/24/1996 15:00\n", "10399 NaN NaN TRIANGLE IL 6/15/1997 23:00\n", "11625 NaN NaN CIRCLE TX 6/7/1998 7:00\n", "12441 NaN RED FIREBALL WA 10/26/1998 17:58\n", "15767 NaN NaN RECTANGLE NV 1/21/2000 11:30\n", "15812 NaN NaN LIGHT NV 2/2/2000 3:00\n", "16054 NaN GREEN NaN FL 3/11/2000 3:30\n", "16608 NaN NaN SPHERE NY 6/15/2000 15:00" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filtering using isnull() \n", "ufo[ufo.City.isnull()]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "25" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check specific column\n", "ufo.City.isnull().sum() " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drop Missing Values " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# shape \n", "ufo.shape " ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2486, 5)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop missing: drop row contains missing values \n", "# it is inplace = False \n", "ufo.dropna(how='any').shape" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# how=all \n", "ufo.dropna(how='all').shape " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15576, 5)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# subset: any \n", "ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18237, 5)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# subset: all \n", "ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filling Missing Values " ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LIGHT 2803\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "CIRCLE 1365\n", "SPHERE 1054\n", "FIREBALL 1039\n", "OVAL 845\n", "CIGAR 617\n", "FORMATION 434\n", "VARIOUS 333\n", "RECTANGLE 303\n", "CYLINDER 294\n", "CHEVRON 248\n", "DIAMOND 234\n", "EGG 197\n", "FLASH 188\n", "TEARDROP 119\n", "CONE 60\n", "CROSS 36\n", "DELTA 7\n", "ROUND 2\n", "CRESCENT 2\n", "PYRAMID 1\n", "HEXAGON 1\n", "DOME 1\n", "FLARE 1\n", "Name: Shape Reported, dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# value counts: by default drop = True \n", "ufo[\"Shape Reported\"].value_counts() " ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LIGHT 2803\n", "NaN 2644\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "CIRCLE 1365\n", "SPHERE 1054\n", "FIREBALL 1039\n", "OVAL 845\n", "CIGAR 617\n", "FORMATION 434\n", "VARIOUS 333\n", "RECTANGLE 303\n", "CYLINDER 294\n", "CHEVRON 248\n", "DIAMOND 234\n", "EGG 197\n", "FLASH 188\n", "TEARDROP 119\n", "CONE 60\n", "CROSS 36\n", "DELTA 7\n", "CRESCENT 2\n", "ROUND 2\n", "PYRAMID 1\n", "DOME 1\n", "FLARE 1\n", "HEXAGON 1\n", "Name: Shape Reported, dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# value counts: false \n", "ufo[\"Shape Reported\"].value_counts(dropna=False) " ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# fillna() \n", "ufo[\"Shape Reported\"].fillna(value=\"VARIOUS\", inplace=True)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "VARIOUS 2977\n", "LIGHT 2803\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "CIRCLE 1365\n", "SPHERE 1054\n", "FIREBALL 1039\n", "OVAL 845\n", "CIGAR 617\n", "FORMATION 434\n", "RECTANGLE 303\n", "CYLINDER 294\n", "CHEVRON 248\n", "DIAMOND 234\n", "EGG 197\n", "FLASH 188\n", "TEARDROP 119\n", "CONE 60\n", "CROSS 36\n", "DELTA 7\n", "CRESCENT 2\n", "ROUND 2\n", "PYRAMID 1\n", "HEXAGON 1\n", "DOME 1\n", "FLARE 1\n", "Name: Shape Reported, dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# now take a look \n", "ufo[\"Shape Reported\"].value_counts() " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }